# Importing libraries
library(dplyr)
library(ggplot2)
library(stringr)
library(gridExtra)
library(outliers)
library(PerformanceAnalytics)
library(foreach)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:dplyr’:
first, last
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
legend
# Silence all warnings for the rest of the session (warn = -1).
options(warn = -1)
# Read the dataset from the CSV file into `df`.
df <- read.csv(file = "../data/data.csv")
# Draw a four-panel summary of one column of `dataset`.
#
# Panels: (1) boxplot of the column, (2) boxplot filled by `hdi_cat`,
# (3) histogram and/or density of the column, (4) the same, split by
# `hdi_cat`. `dataset` must therefore contain a column named `hdi_cat`.
#
# Arguments:
#   dataset       data frame holding `col` and `hdi_cat`.
#   col           name of the column to plot, as a string.
#   fw            if TRUE, facet panel 4 by `hdi_cat` on a single row.
#   hist          two logicals: draw a histogram in panel 3 / panel 4.
#                 Default c(FALSE, FALSE).
#   density       two logicals: draw a density curve in panel 3 / panel 4.
#                 Default c(TRUE, TRUE).
#   bins          two numbers: histogram bins for panel 3 / panel 4.
#                 Default c(10, 10).
#   xtick_angles  two numbers: x tick label angle for panel 3 / panel 4.
#                 Default c(90, 90).
#   sep           if TRUE, draw the boxplots and the distributions as two
#                 separate two-row figures; otherwise one four-row figure.
#   savefig       c(save, width, height). When `save` is TRUE the four-row
#                 figure is written to `filename` with ggsave().
#                 Default c(FALSE, 14, 14).
#   filename      output path used when saving.
#
# A single value given for hist, density, bins or xtick_angles is recycled
# to both panels.
plots <- function(dataset, col, fw = FALSE, hist = "default",
                  density = "default", bins = "default",
                  xtick_angles = "default", sep = FALSE, savefig = "default",
                  filename = "./plot.png") {
  # identical() keeps every sentinel test a length-one logical even when the
  # caller passes a two-element vector such as c(TRUE, FALSE). The previous
  # `x == 'default'` test had length two in that case, which `if` rejects
  # with an error since R 4.2.0.
  if (identical(bins, "default")) bins <- 10
  if (identical(xtick_angles, "default")) xtick_angles <- 90
  if (identical(hist, "default")) hist <- FALSE
  if (identical(density, "default")) density <- TRUE
  if (identical(savefig, "default")) savefig <- c(FALSE, 14, 14)

  # One setting per distribution panel (panel 3, panel 4).
  bins <- rep_len(bins, 2)
  xtick_angles <- rep_len(xtick_angles, 2)
  hist <- as.logical(rep_len(hist, 2))
  density <- as.logical(rep_len(density, 2))

  box_theme <- theme(axis.title.x = element_blank(),
                     axis.text.y = element_blank())

  # `.data[[col]]` looks the column up by its string name inside `dataset`,
  # so a column that happens to be called "col" cannot be picked by mistake.
  p1 <- ggplot(dataset, aes(x = .data[[col]])) +
    geom_boxplot() +
    ggtitle(col) +
    box_theme
  p2 <- ggplot(dataset, aes(x = .data[[col]], fill = hdi_cat)) +
    geom_boxplot() +
    ggtitle(paste(col, "grouped by HDI")) +
    box_theme
  p3 <- ggplot(dataset, aes(x = .data[[col]])) +
    ggtitle(col) +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = xtick_angles[1]))
  p4 <- ggplot(dataset, aes(x = .data[[col]])) +
    ggtitle(paste(col, "by HDI group")) +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_text(angle = xtick_angles[2]))

  # Histograms are drawn on the density scale so they can share an axis with
  # the density curves.
  if (isTRUE(hist[1])) {
    p3 <- p3 + geom_histogram(aes(y = ..density..), bins = bins[1])
  }
  if (isTRUE(hist[2])) {
    p4 <- p4 + geom_histogram(aes(fill = hdi_cat, y = ..density..),
                              show.legend = FALSE, bins = bins[2])
  }
  if (isTRUE(density[1])) {
    p3 <- p3 + geom_density()
  }
  if (isTRUE(density[2])) {
    p4 <- p4 + geom_density(aes(group = hdi_cat, colour = hdi_cat,
                                fill = hdi_cat))
  }
  if (isTRUE(fw)) {
    p4 <- p4 + facet_wrap(~hdi_cat, nrow = 1)
  }

  if (isTRUE(sep)) {
    grid.arrange(p1, p2, nrow = 2)
    grid.arrange(p3, p4, nrow = 2)
  } else {
    grid.arrange(p1, p2, p3, p4, nrow = 4)
  }

  # The saved figure is always the stacked four-row layout, whatever `sep` is.
  if (isTRUE(savefig[1] == TRUE)) {
    ggsave(filename = filename,
           plot = arrangeGrob(p1, p2, p3, p4, nrow = 4),
           width = savefig[2], height = savefig[3])
  }
}
# Helper to colour numeric variables by a categorical variable.
#
# Maps every element of `cat_var` to an entry of `colors_vector`.
#
# Arguments:
#   cat_var        vector that as.factor() can convert; its integer level
#                  codes are used as indices into `colors_vector`.
#   colors_vector  vector of colours; element k is assigned to the k-th
#                  factor level of `cat_var`.
#
# Returns an unnamed vector with one entry per element of `cat_var`, of the
# same type as `colors_vector`. Missing values in `cat_var`, and level codes
# beyond length(colors_vector), yield NA.
colors <- function(cat_var, colors_vector) {
  level_codes <- as.integer(as.factor(cat_var))
  # Vectorised lookup; also returns a zero-length result for empty input.
  unname(colors_vector[level_codes])
}
Method of calculation (from Wikipedia):

# Variables used in the analysis: every column of `df` from the 6th up to
# and including the third-from-last one.
cols <- names(df)
cols <- cols[6:(length(cols) - 2)]
cols

# One colour per HDI category.
color_1 <- "blueviolet"
color_2 <- "red"
color_3 <- "black"
color_4 <- "green"
palette <- c(color_1, color_2, color_3, color_4)
hdi_colours <- colors(df$hdi_cat, palette)

# Plot size for the notebook output.
options(repr.plot.width = 12, repr.plot.height = 14)

plots(
  dataset = df,
  col = cols[1],
  hist = c(TRUE, TRUE),
  density = c(FALSE, FALSE),
  xtick_angles = c(50, 50),
  bins = c(30, 30),
  fw = TRUE,
  sep = FALSE
)
# Names of the countries whose foreign_inv_inflows value exceeds the
# threshold below.
df$country_name[df$foreign_inv_inflows > 10014783649.551354]
There are several extreme outliers for GDP, not only overall but also within each HDI group, and most values are concentrated in a narrow range. The highest-GDP countries
# Sample quantiles of the gdp column, using quantile()'s default
# probabilities.
quantile(df$gdp)
The main overall outliers are the following:
# Countries whose gdp exceeds 1e12.
df$country_name[df$gdp > 1e12]

# One four-panel figure per remaining analysis variable.
plots(dataset = df, col = cols[2], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[3], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = TRUE, sep = FALSE)
# This figure is also written to disk.
plots(dataset = df, col = cols[4], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = TRUE, sep = FALSE,
      savefig = c(TRUE, 12, 12), filename = "education_years.png")
plots(dataset = df, col = cols[5], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[6], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = FALSE, sep = FALSE)
plots(dataset = df, col = cols[7], hist = c(TRUE, TRUE),
      density = c(FALSE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[8], hist = c(TRUE, TRUE),
      density = c(FALSE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[9], hist = c(TRUE, TRUE),
      density = c(FALSE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[10], hist = c(TRUE, TRUE),
      density = c(TRUE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)

# Standard deviation of perc_internet_users within the "very high" group.
sd(df$perc_internet_users[df$hdi_cat == "very high"])

plots(dataset = df, col = cols[11], hist = c(TRUE, TRUE),
      density = c(FALSE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[12], hist = c(TRUE, TRUE),
      density = c(TRUE, FALSE), bins = c(20, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[13], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = FALSE, sep = FALSE)
plots(dataset = df, col = cols[14], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[15], hist = c(TRUE, TRUE),
      density = c(FALSE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[16], hist = c(TRUE, FALSE),
      density = c(FALSE, TRUE), bins = c(30, 30), fw = FALSE, sep = FALSE)
plots(dataset = df, col = cols[17], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = FALSE, sep = FALSE)
plots(dataset = df, col = cols[18], hist = c(TRUE, TRUE),
      density = c(FALSE, FALSE), bins = c(30, 30), fw = TRUE, sep = FALSE)
plots(dataset = df, col = cols[19], hist = c(TRUE, FALSE),
      density = c(TRUE, TRUE), bins = c(30, 30), fw = FALSE, sep = FALSE)
# Plot size for the notebook output.
options(repr.plot.width = 14, repr.plot.height = 14)

# Keep only the analysis variables and draw the Pearson correlation chart.
pa <- dplyr::select(df, cols)
chart.Correlation(pa, histogram = TRUE, pch = 19, method = "pearson")

# Distinct (hdi_cat, colour) pairs, i.e. which colour each HDI group received.
unique(cbind(df$hdi_cat, hdi_colours))
| | hdi_colours |
|---|---|
| high | blueviolet |
| medium | black |
| very high | green |
| low | red |
# Scatterplot matrix of the analysis variables, points coloured by HDI group.
pairs(pa, pch = 1, col = hdi_colours)

# For every ordered pair of variables, keep the correlation coefficient with
# the largest absolute value among the three methods below, together with the
# name of the method that produced it.
methods <- c("kendall", "spearman", "pearson")
n_pairs <- length(cols)^2
# Preallocate one row per ordered pair, with the final column types.
corr_mat <- data.frame(
  var1 = character(n_pairs),
  var2 = character(n_pairs),
  coef = numeric(n_pairs),
  corr_type = character(n_pairs),
  stringsAsFactors = FALSE
)
cnt <- 0
for (i in seq_along(cols)) {
  for (j in seq_along(cols)) {
    cnt <- cnt + 1
    comb1 <- df %>% dplyr::select(cols[i])
    comb2 <- df %>% dplyr::select(cols[j])
    maximum_cor <- 0
    method_used <- ""
    any_valid <- FALSE
    for (method in methods) {
      correl <- cor(comb1[, 1], comb2[, 1], method = method)
      # cor() yields NA for missing values or a constant column; comparing NA
      # inside `if` would abort the whole loop, so such results are skipped.
      if (is.na(correl)) {
        next
      }
      any_valid <- TRUE
      if (abs(correl) > abs(maximum_cor)) {
        maximum_cor <- correl
        method_used <- method
      }
    }
    # A pair for which no method produced a value is recorded as NA, not 0.
    corr_mat$coef[cnt] <- if (any_valid) maximum_cor else NA_real_
    corr_mat$var1[cnt] <- cols[i]
    corr_mat$var2[cnt] <- cols[j]
    corr_mat$corr_type[cnt] <- method_used
  }
}

# Heatmap of the selected coefficients, labelled with the rounded value.
corr_mat %>%
  ggplot(aes(var1, var2, fill = coef)) +
  geom_tile() +
  geom_text(aes(label = round(coef, 2))) +
  scale_fill_gradient(low = "red", high = "blue", limits = c(-1, 1)) +
  theme(
    axis.text.x = element_text(angle = 70, vjust = 1, size = 12, hjust = 1),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank()
  )